set.seed(2)
required_packages <- c("tidyverse", "magrittr", "DBI", "bigrquery", "arrow","glue", "vroom","janitor", "gt", "ggwordcloud", "readxl", "ggthemes", "hrbrthemes", "extrafont", "plotly", "scales", "stringr", "gganimate", "here", "tidytext", "sentimentr", "scales", "DT", "here", "sm", "mblm", "glue", "fs", "knitr", "rmdformats", "janitor", "urltools", "colorspace", "pdftools", "showtext", "pander")
for(i in required_packages) {
if(!require(i, character.only = T)) {
# if package is not existing, install then load the package
install.packages(i, dependencies = T)
require(i, character.only = T)
}
}
panderOptions('table.alignment.default', "left")
## quality of png's
dpi <- 750
## theme updates; please adjust to client´s website
#theme_set(ggthemes::theme_clean(base_size = 15))
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 20,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps
theme_map <-
theme_void(base_family = "Montserrat") +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
bl_dark <- darken(bl_col, .3, space = "HLS")
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)
Basic stats
SELECT COUNT(keyword_info_search_volume) as `total_count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
total_count <- sql$total_count
display_number(total_count, "Total number of different searches")
SELECT SUM(COALESCE(keyword_info_search_volume / 10000, 0)) AS total_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
# Calculated in a roundabout way to avoid integer overflow
total_volume <- sql$total_volume * 10000
tibble("Total number of searches" = glue("~{format(round(total_volume / 1000000), big.mark = ',')} million")) %>%
pander()
This table shows the top 10 searches. They are almost all spelling errors. As in, they are not really searched, but rather people attempting to go to Youtube or Facebook, but typing it wrong. Oddly they are all attributed as having a search volume of exactly 185 million.
top %>%
select(keyword, location, spell, spell_type, keyword_info_search_volume) %>%
head(10) %>%
gt() %>%
tab_options(table.align = "left") %>%
tab_header("Top searches")
| keyword |
location |
spell |
spell_type |
keyword_info_search_volume |
| face bak |
2840 |
facebook |
did_you_mean |
185000000 |
| yput |
2840 |
youtube |
did_you_mean |
185000000 |
| xyoutube |
2840 |
youtube |
showing_results_for |
185000000 |
| utub |
2840 |
youtube |
did_you_mean |
185000000 |
| yioutub |
2840 |
youtube |
showing_results_for |
185000000 |
| fa9ebook |
2840 |
facebook |
showing_results_for |
185000000 |
| ioyoutube |
2840 |
youtube |
showing_results_for |
185000000 |
| kn youtube |
2840 |
on |
did_you_mean |
185000000 |
| the only youtube |
2840 |
|
|
185000000 |
| youtbe == |
2840 |
youtube |
did_you_mean |
185000000 |
SELECT COUNT(*) as `missing_count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
missing_count <- sql$missing_count
tibble("Missing search volume" = scales::percent(missing_count / total_count, accuracy = 0.001)) %>% pander()
The missing have some searches that are likely high volume. Thus they are truly missing, and not just 0s.
SELECT keyword, keyword_info_search_volume
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume IS NULL
ORDER BY RAND()
LIMIT 10
sql %>% gt() %>% tab_options(table.align = "left") %>%
tab_header("Keywords with missing search volume")
| keyword |
keyword_info_search_volume |
| Crain's HCSC compensation |
NA |
| Glycine sweetener |
NA |
| South Carolina Republican primary 2020 polls |
NA |
| Jo Malone 針管香水 評價 |
NA |
| Starting Strength program |
NA |
| DeWalt Mini Circular saw |
NA |
| Juilliard contact |
NA |
| Garden soil delivery |
NA |
| Child and Family Center jobs |
NA |
| Best entry indicator MT4 |
NA |
SELECT COUNT(*) as `zero_count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
AND keyword_info_search_volume = 0
zero_count <- sql$zero_count
tibble("Searches with search volume 0" = scales::percent(zero_count / total_count, accuracy = 0.001)) %>% pander()
display_number(total_volume / total_count, "Mean search volume")
SELECT approx_quantiles(keyword_info_search_volume, 2)[offset(1)] AS `median`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
display_number(sql$median, "Median search volume")
SELECT AVG(`keyword_info_cpc`) AS `mean_cpc`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE `location` = 2840
display_number(sql$mean_cpc, "Mean CPC")
SELECT approx_quantiles(keyword_info_cpc, 2)[offset(1)] AS `median_cpc`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
display_number(sql$median_cpc, "Median CPC")
Search tails
volume_top <- top %>%
add_rownames() %>%
mutate(rowname = as.numeric(rowname)) %>%
select(rowname, volume = keyword_info_search_volume)
ylab <- c(50, 100, 150, 200)
volume_top %>%
mutate(cat = case_when(
rowname < 500 ~ "Top 500",
rowname < 1000 ~ "Top 1000",
rowname < 2000 ~ "Top 2000",
rowname < 5801 ~ "Top 0.001%",
T ~ "Remaining 99.99%"
)) %>%
mutate(cat = factor(cat, levels = c("Top 500", "Top 1000", "Top 2000", "Top 0.001%", "Remaining 99.99%"))) %>%
head(10500) %>%
ggplot(aes(x = rowname, y = volume, fill = cat)) +
geom_area(alpha = 0.8) +
scale_y_continuous(
labels = glue("{ylab} M"),
breaks = 10^6 * ylab,
limits = c(0, 200* 10^6),
expand = c(0,0)
) +
labs(x = "", title = "Long tail") +
geom_segment(aes(x = 8000, y = 35*10^6, xend = 10500, yend = 35*10^6),
arrow = arrow(length = unit(0.5, "cm"))) +
scale_x_continuous(expand = c(0,0))

get_count_range <- function(lower, higher)
{
sql <- glue(
"SELECT COUNT(*) AS `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume >= {lower}
AND keyword_info_search_volume <= {higher}")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb)$count
}
df <- tribble(
~cat, ~count,
"0 - 10", get_count_range(0, 10),
"11- 100", get_count_range(11, 100),
"101 - 1000", get_count_range(101, 1000),
"1001 - 10000", get_count_range(1001, 10000),
"10001 - 100000", get_count_range(10001, "100000"),
"100001+", get_count_range("100001", "100000000000"))
df %>%
mutate(count = count / sum(count)) %>%
ggplot(aes(x = reorder(cat, desc(count)), y = count)) +
geom_bar(stat = "identity", fill = "turquoise4", color = "black") +
theme_light() +
labs(x = "", y = "", title = "Long tail") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0,0))
write_length_volume <- function()
{
get_length_volume <- function(l)
{
sql <- glue(
"SELECT sum(keyword_info_search_volume) / 10000 as `volume`, count(keyword_info_search_volume) as `count`
FROM `dataforseo-bigquery.dataforseo_data.keyword_data`
WHERE location = 2840
AND keyword_info_search_volume IS NOT NULL
AND LENGTH(keyword) = {l}")
tb <- bq_project_query("dataforseo-bigquery", sql)
bq_table_download(tb) %>% mutate(length = l)
}
df <- map_df(1:50, get_length_volume)
write_csv(df, "../proc_data/keyword_length_volume.csv")
}
df <- read_csv("../proc_data/keyword_length_volume.csv")
df %>% mutate(
volume = 10000 * volume,
ratio = volume / count
) %>%
ggplot(aes(x = length, y = ratio)) +
geom_line(color = "turquoise4", size = 1) + geom_point(color = "turquoise4") +
theme_light() +
labs(x = "", y = "", title = "Average search volume by keyword length")
